Part I - Machine Learning
Name: Andreas Eckmann
E-Mail: eckmanna@ethz.ch
from IPython.display import display
import pandas as pd
pd.options.display.max_columns = None # Display all columns of a dataframe
pd.options.display.max_rows = 700
from pprint import pprint
import time
import datetime
import os
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import sklearn
# To plot pretty figures
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
#%matplotlib notebook
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import seaborn as sns
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings = lambda *a, **kw: None
# load the csv data from my online repository
data = pd.read_excel('https://raw.githubusercontent.com/eckmanna/big_data_policy_2020/master/course_project/data/df_pv_data_germany.xlsx').iloc[:, :] #avoiding "unnamed"-row
print(data.shape,
data.head())
data.dtypes
p1 = sns.lineplot(x = "Year", y="Installed_Cap_below_30_pc", hue="Bundesland", data = data)
#plot module-costs
ax2 = p1.twinx()
sns.lineplot(x = "Year", y="module_cost", color="black", dashes=True, data = data, ax = ax2)
ax2.spines['right'].set_position(('axes', 1.0))
ax2.lines[0].set_linestyle("--")
#plot feed-in
ax3 = p1.twinx()
ax3.spines['right'].set_position(('axes', 1.1))
sns.lineplot(x = "Year", y="feed-in", color="blue", data = data, ax = ax3)
ax3.lines[0].set_linestyle("--")
p1.legend(loc='center right', bbox_to_anchor=(1.5, 0.5), ncol=1)
p1.set(xlabel="Year",
ylabel="Installed PV Capacity per capita [W/cap]")
#title= "Annually Installed PC Capacity per capita per Bundesland [<30kW]")
plt.show()
p2 = sns.lineplot(x="Year", y="Cum_Installed_Cap_below_30_pc", hue="Bundesland", data = data)
p2.legend(loc='center right', bbox_to_anchor=(1.7, 0.5), ncol=1)
p2.set(xlabel="Year",
ylabel="Installed PV Capacity per capita [W/cap]")
#title= "Cumulative Installed PV Capacity per capita per Bundesland [<30kW]")
plt.show()
p3 = sns.lineplot(x="Year", y="GDP_pc", hue="Bundesland", data = data)
p3.legend(loc='center right', bbox_to_anchor=(1.7, 0.5), ncol=1)
p3.set(xlabel="Year",
ylabel="GDP per capita [EUR/cap]")
#title= "GDP per capita per Bundesland")
p4 = sns.lineplot(x="Year", y="pop_density", hue="Bundesland", data = data)
p4.legend(loc='center right', bbox_to_anchor=(1.7, 0.5), ncol=1)
p4.set(xlabel="Year",
ylabel="Population density [cap/km2]")
#title= "Population density per Bundesland")
p5 = sns.lineplot(x="Year", y="share_green_party", hue="Bundesland", data = data)
p5.legend(loc='center right', bbox_to_anchor=(1.7, 0.5), ncol=1)
p5.set(xlabel="Year",
ylabel="Share [-]")
#title= "Share of green party in the parliament")
# filter to only work with complete data (balanced panel)
df1 = data[(data.Year>=1992) & (data.Year<2020)]
df1.head()
df1.tail()
# extract list of bundeslaender
bundeslaender = df1['Bundesland'].unique()
# extract list of years
years = df1['Year'].unique()
print(bundeslaender)
print(bundeslaender.shape)
print(years)
print(years.shape)
The following independent variables are used for ML prediction:
National variables have the same values for all states (Bundesländer).
Sub-national variables describe regional differences and are unique for each state (Bundesland).
Variables not used due to incompleteness:
# X from datafram to array
X_full = df1[['Year','module_cost','feed-in','population','area','GDP_pc','solar_irradiance','sun_hours','share_green_party']].to_numpy()
# use both population and area, rather than population density only
X_full.shape
# Bundeslaender from dataframe to array
Bundesland_full = df1['Bundesland'].to_numpy()
print(Bundesland_full.shape)
The dependent outcome variable Y:
# Y from datafram to array
Y_full = df1['Installed_Cap_below_30_pc'].to_numpy()
Y_full.shape
# X from array to dataframe
X_df = pd.DataFrame(X_full, columns=['Year','module_cost','feed-in','population','area','GDP_pc','solar_irradiance','sun_hours','share_green_party'])
# Format year
X_df['Year'] = pd.to_datetime(X_df['Year'], format='%Y').dt.year
X_df.head()
#check for missing values
print("Missing X values:", X_df.isnull().sum())
print("Missing Y values:", df1['Installed_Cap_below_30_pc'].isnull().sum())
sns.set(rc={'figure.figsize':(12,12)})
correlation_matrix = X_df.corr().round(2)
sns.heatmap(correlation_matrix) #annot=True
plt.show()
print("X_full", X_full.shape)
print("Y_full", Y_full.shape)
X_df.head()
mask_year = X_full[:,0]<2000 #drop data before 2000 (feed-in was very low)
# optional: apply different masks
"""
mask_year = np.logical_or.reduce((X_full[:,0]<2000, #drop data before 2000
X_full[:,0]==2009,
X_full[:,0]==2010,
X_full[:,0]==2011))
"""
Y_full=Y_full[mask_year==False]
X_full=X_full[mask_year==False]
X_df=X_df[mask_year==False]
Bundesland_full=Bundesland_full[mask_year==False]
print(X_full.shape)
print(Y_full.shape)
print(Bundesland_full.shape)
data from years 2000 - 2019 (20 years)
train 80% or 16 years: from 2000 - 2015
test 20% or 4 years: from 2016 - 2019
# splitting the data
split = 2016
mask = X_full[:,0] >= split # TRUE for test data from split-year on
X_train = X_full[mask == False]
Y_train = Y_full[mask == False]
X_test = X_full[mask == True]
Y_test = Y_full[mask == True]
Bundesland_train = Bundesland_full[mask==False]
Bundesland_test = Bundesland_full[mask==True]
print("mask", mask.shape)
print("train data", X_train.shape, Y_train.shape)
print("test data", X_test.shape, Y_test.shape)
print("bundeslaender in train data", Bundesland_train.shape)
print("bundeslaender in test data", Bundesland_test.shape)
For plotting the results:
# create a new dataframe for plotting the results
Y = pd.DataFrame(df1, columns=['Year','Bundesland','Installed_Cap_below_30_pc'])
# create a new variable "Data" to classify the type of data
Y['Data'] = "$Y_i$ Installed Capacity"
Y = Y.rename(columns = {'Installed_Cap_below_30_pc':'Y'})
Y.head()
# function for plotting fti quality
def plot_fit_quality(values_test, predicted):
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
x = np.arange(len(predicted))
plt.scatter(x, predicted - values_test, color='steelblue', marker='o')
plt.plot([0, len(predicted)], [0, 0], "k:")
max_diff = np.max(np.abs(predicted - values_test))
plt.ylim([-max_diff, max_diff])
plt.ylabel("error")
plt.xlabel("sample id")
plt.subplot(1, 2, 2)
plt.scatter(x, (predicted - values_test) / values_test, color='steelblue', marker='o')
plt.plot([0, len(predicted)], [0, 0], "k:")
plt.ylim([-.5, .5])
plt.ylabel("relative error")
plt.xlabel("sample id")
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, Y_train)
features = list(X_df.columns)
from sklearn.metrics import mean_squared_error
Y_train_pred = lin_reg.predict(X_train)
train_mse = mean_squared_error(Y_train, Y_train_pred)
train_rmse = np.sqrt(train_mse)
Y_test_pred = lin_reg.predict(X_test)
test_mse = mean_squared_error(Y_test, Y_test_pred)
test_rmse = np.sqrt(test_mse)
from sklearn.metrics import r2_score
r2 = round(r2_score(Y_test, Y_test_pred), 2)
from sklearn.metrics import explained_variance_score
exp_var = round(explained_variance_score(Y_test,Y_test_pred),2)
print('The coefficients of the features from the linear model:')
print(dict(zip(features, [round(x, 2) for x in lin_reg.coef_])))
print("Train RMS: %s" % train_rmse) # = np.sqrt(np.mean((predicted - expected) ** 2))
print("Test RMS: %s" % test_rmse)
print("R-squared for training dataset:{}".
format(np.round(lin_reg.score(X_train, Y_train), 2)))
print("Test R2: %s" % r2)
print("Explained variance: %s" % exp_var)
# Regplot
g=sns.regplot(x= Y_test_pred, y=Y_test, x_bins = 100)
g=g.set_title("Test sample")
plt.xlabel("Predicted Installed Capacity per capita: $\hat{Y}_i$")
plt.ylabel("Installed Capacity per capita: $Y_i$")
plt.annotate('R2={}'.format(r2),
xy=(1, 0), xycoords='axes fraction',
horizontalalignment='right',
verticalalignment='bottom')
plt.annotate('Notes: 100 binned',
xy=(0, 0), xycoords='figure fraction',
horizontalalignment='left',
verticalalignment='bottom')
plt.plot([5, 20], [5, 20], '--k')
plt.axis('tight')
plt.tight_layout()
plt.show(g)
plot_fit_quality(Y_test, Y_test_pred)
# extract the predicted data into a dataframe
df_pred = np.column_stack((Bundesland_test,X_test[:,0],Y_test,Y_test_pred))
df_pred = pd.DataFrame(df_pred, columns=['Bundesland','Year','Y_test','Y_test_pred'])
# adjust data types
df_pred['Year'] = df_pred['Year'].astype(int)
df_pred['Y_test'] = df_pred['Y_test'].astype(float)
df_pred['Y_test_pred'] = df_pred['Y_test_pred'].astype(float)
# create a new dataframe for plotting the results
df_pred_plot = pd.DataFrame(df_pred, columns=['Bundesland','Year','Y_test_pred'])
# create a new variable "Data" to classify the type of data
df_pred_plot['Data'] = "$\hat{Y}_i$ Predicted Installed Capacity"
df_pred_plot = df_pred_plot.rename(columns = {'Y_test_pred':'Y'})
# merge two data frames with actual and predicted data together
df_plot = pd.merge(Y, df_pred_plot , how='outer', on=['Year','Bundesland','Y','Data'])
df_plot = df_plot.sort_values(by=['Year','Bundesland'])
# plot the results
ax = sns.lineplot(x="Year", y="Y", hue="Data", style="Data", markers=True, dashes=False,
data=df_plot)
ax.set(xlabel="Year",
ylabel="Installed Capacity per capita [W/cap]")
#title= "Annually Installed PV Capacity per capita in Germany [<30kW]")
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1)
ridge_reg.fit(X_train, Y_train)
Y_train_pred = ridge_reg.predict(X_train)
Y_test_pred = ridge_reg.predict(X_test)
train_mse = mean_squared_error(Y_train, Y_train_pred)
train_rmse = np.sqrt(train_mse)
test_mse = mean_squared_error(Y_test, Y_test_pred)
test_rmse = np.sqrt(test_mse)
print("train RMS: %s" % train_rmse)
print("test RMS: %s" % test_rmse)
print("train R2: %s" % round(r2_score(Y_train, Y_train_pred), 4))
print("test R2: %s" % round(r2_score(Y_test, Y_test_pred), 4))
r2 = r2=round(r2_score(Y_test, Y_test_pred), 2)
# Regplot (code taken from the lecture Notebook W3)
g=sns.regplot(x= Y_test_pred, y=Y_test, x_bins = 100)
g=g.set_title("Test sample")
plt.xlabel("Predicted Installed Capacity per capita: $\hat{Y}_i$")
plt.ylabel("Installed Capacity per capita: $Y_i$")
plt.annotate('R2={}'.format(r2),
xy=(1, 0), xycoords='axes fraction',
horizontalalignment='right',
verticalalignment='bottom')
plt.annotate('Notes: 100 binned',
xy=(0, 0), xycoords='figure fraction',
horizontalalignment='left',
verticalalignment='bottom')
plt.plot([10, 20], [10, 20], '--k')
plt.axis('tight')
plt.tight_layout()
plt.show(g)
plot_fit_quality(Y_test, Y_test_pred)
# extract the predicted data into a dataframe
df_pred = np.column_stack((Bundesland_test,X_test[:,0],Y_test,Y_test_pred))
df_pred = pd.DataFrame(df_pred, columns=['Bundesland','Year','Y_test','Y_test_pred'])
# adjust data types
df_pred['Year'] = df_pred['Year'].astype(int)
df_pred['Y_test'] = df_pred['Y_test'].astype(float)
df_pred['Y_test_pred'] = df_pred['Y_test_pred'].astype(float)
# create a new dataframe for plotting the results
df_pred_plot = pd.DataFrame(df_pred, columns=['Bundesland','Year','Y_test_pred'])
# create a new variable "Data" to classify the type of data
df_pred_plot['Data'] = "$\hat{Y}_i$ Predicted Installed Capacity"
df_pred_plot = df_pred_plot.rename(columns = {'Y_test_pred':'Y'})
# merge two data frames with actual and predicted data together
df_plot = pd.merge(Y, df_pred_plot , how='outer', on=['Year','Bundesland','Y','Data'])
df_plot = df_plot.sort_values(by=['Year','Bundesland'])
# plot the results
ax = sns.lineplot(x="Year", y="Y", hue="Data", style="Data", markers=True, dashes=False,
data=df_plot)
ax.set(xlabel="Year",
ylabel="Installed Capacity per capita [W/cap]")
#title= "Annually Installed PV Capacity per capita in Germany [<30kW]")
from sklearn.preprocessing import PolynomialFeatures
poly_features=PolynomialFeatures(degree=2)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.fit_transform(X_test)
lin_reg = LinearRegression()
lin_reg.fit(X_train_poly, Y_train)
Y_train_pred = lin_reg.predict(X_train_poly)
Y_test_pred = lin_reg.predict(X_test_poly)
train_mse = mean_squared_error(Y_train, Y_train_pred)
train_rmse = np.sqrt(train_mse)
test_mse = mean_squared_error(Y_test,Y_test_pred)
test_rmse = np.sqrt(test_mse)
print("train RMS: %s" % train_rmse)
print("test RMS: %s" % test_rmse)
print("train R2: %s" % round(r2_score(Y_train, Y_train_pred), 2))
print("test R2: %s" % round(r2_score(Y_test, Y_test_pred), 2))
r2 = r2=round(r2_score(Y_test, Y_test_pred), 2)
# Regplot
g=sns.regplot(x= Y_test_pred, y=Y_test, x_bins = 100)
g=g.set_title("Test sample")
plt.xlabel("Predicted Installed Capacity per capita: $\hat{Y}_i$")
plt.ylabel("Installed Capacity per capita: $Y_i$")
plt.annotate('R2={}'.format(r2),
xy=(1, 0), xycoords='axes fraction',
horizontalalignment='right',
verticalalignment='bottom')
plt.annotate('Notes: 100 binned',
xy=(0, 0), xycoords='figure fraction',
horizontalalignment='left',
verticalalignment='bottom')
plt.plot([0, 17.5], [0, 17.5], '--k')
plt.axis('tight')
plt.tight_layout()
plt.show(g)
plot_fit_quality(Y_test, Y_test_pred)
# extract the predicted data into a dataframe
df_pred = np.column_stack((Bundesland_test,X_test[:,0],Y_test,Y_test_pred))
df_pred = pd.DataFrame(df_pred, columns=['Bundesland','Year','Y_test','Y_test_pred'])
# adjust data types
df_pred['Year'] = df_pred['Year'].astype(int)
df_pred['Y_test'] = df_pred['Y_test'].astype(float)
df_pred['Y_test_pred'] = df_pred['Y_test_pred'].astype(float)
# create a new dataframe for plotting the results
df_pred_plot = pd.DataFrame(df_pred, columns=['Bundesland','Year','Y_test_pred'])
# create a new variable "Data" to classify the type of data
df_pred_plot['Data'] = "$\hat{Y}_i$ Predicted Installed Capacity"
df_pred_plot = df_pred_plot.rename(columns = {'Y_test_pred':'Y'})
# merge two data frames with actual and predicted data together
df_plot = pd.merge(Y, df_pred_plot , how='outer', on=['Year','Bundesland','Y','Data'])
df_plot = df_plot.sort_values(by=['Year','Bundesland'])
# plot the results
ax = sns.lineplot(x="Year", y="Y", hue="Data", style="Data", markers=True, dashes=False,
data=df_plot)
ax.set(xlabel="Year",
ylabel="Installed Capacity per capita [W/cap]")
#title= "Annually Installed PV Capacity per capita in Germany [<30kW]")
from sklearn.linear_model import Lasso
lasso_reg=Lasso(alpha=1)
lasso_reg.fit(X_train, Y_train)
Y_train_pred = lasso_reg.predict(X_train)
Y_test_pred = lasso_reg.predict(X_test)
train_mse = mean_squared_error(Y_train, Y_train_pred)
train_rmse = np.sqrt(train_mse)
test_mse = mean_squared_error(Y_test, Y_test_pred)
test_rmse = np.sqrt(test_mse)
print("train RMS: %s" % train_rmse)
print("test RMS: %s" % test_rmse)
print("train R2: %s" % round(r2_score(Y_train, Y_train_pred), 2))
print("test R2: %s" % round(r2_score(Y_test, Y_test_pred), 2))
r2 = r2=round(r2_score(Y_test, Y_test_pred), 2)
# Regplot
g=sns.regplot(x= Y_test_pred, y=Y_test, x_bins = 100)
g=g.set_title("Test sample")
plt.xlabel("Predicted Installed Capacity per capita: $\hat{Y}_i$")
plt.ylabel("Installed Capacity per capita: $Y_i$")
plt.annotate('R2={}'.format(r2),
xy=(1, 0), xycoords='axes fraction',
horizontalalignment='right',
verticalalignment='bottom')
plt.annotate('Notes: 100 binned',
xy=(0, 0), xycoords='figure fraction',
horizontalalignment='left',
verticalalignment='bottom')
plt.plot([10, 25], [10, 25], '--k')
plt.axis('tight')
plt.tight_layout()
plt.show(g)
plot_fit_quality(Y_test, Y_test_pred)
# extract the predicted data into a dataframe
df_pred = np.column_stack((Bundesland_test,X_test[:,0],Y_test,Y_test_pred))
df_pred = pd.DataFrame(df_pred, columns=['Bundesland','Year','Y_test','Y_test_pred'])
# adjust data types
df_pred['Year'] = df_pred['Year'].astype(int)
df_pred['Y_test'] = df_pred['Y_test'].astype(float)
df_pred['Y_test_pred'] = df_pred['Y_test_pred'].astype(float)
# create a new dataframe for plotting the results
df_pred_plot = pd.DataFrame(df_pred, columns=['Bundesland','Year','Y_test_pred'])
# create a new variable "Data" to classify the type of data
df_pred_plot['Data'] = "$\hat{Y}_i$ Predicted Installed Capacity"
df_pred_plot = df_pred_plot.rename(columns = {'Y_test_pred':'Y'})
# merge two data frames with actual and predicted data together
df_plot = pd.merge(Y, df_pred_plot , how='outer', on=['Year','Bundesland','Y','Data'])
df_plot = df_plot.sort_values(by=['Year','Bundesland'])
# plot the results
ax = sns.lineplot(x="Year", y="Y", hue="Data", style="Data", markers=True, dashes=False,
data=df_plot)
ax.set(xlabel="Year",
ylabel="Installed Capacity per capita [W/cap]")
#title= "Annually Installed PV Capacity per capita in Germany [<30kW]")
from sklearn.linear_model import ElasticNet
elanet_reg=ElasticNet(random_state=0)
elanet_reg.fit(X_train, Y_train)
Y_train_pred = elanet_reg.predict(X_train)
Y_test_pred = elanet_reg.predict(X_test)
train_mse = mean_squared_error(Y_train, Y_train_pred)
train_rmse = np.sqrt(train_mse)
test_mse = mean_squared_error(Y_test, Y_test_pred)
test_rmse = np.sqrt(test_mse)
print("train RMS: %s" % train_rmse)
print("test RMS: %s" % test_rmse)
print("train R2: %s" % round(r2_score(Y_train, Y_train_pred), 2))
print("test R2: %s" % round(r2_score(Y_test, Y_test_pred), 2))
r2 = r2=round(r2_score(Y_test, Y_test_pred), 2)
# Regplot
g=sns.regplot(x= Y_test_pred, y=Y_test, x_bins = 100)
g=g.set_title("Test sample")
plt.xlabel("Predicted Installed Capacity per capita: $\hat{Y}_i$")
plt.ylabel("Installed Capacity per capita: $Y_i$")
plt.annotate('R2={}'.format(r2),
xy=(1, 0), xycoords='axes fraction',
horizontalalignment='right',
verticalalignment='bottom')
plt.annotate('Notes: 100 binned',
xy=(0, 0), xycoords='figure fraction',
horizontalalignment='left',
verticalalignment='bottom')
plt.plot([10, 25], [10, 25], '--k')
plt.axis('tight')
plt.tight_layout()
plt.show(g)
plot_fit_quality(Y_test, Y_test_pred)
# extract the predicted data into a dataframe
df_pred = np.column_stack((Bundesland_test,X_test[:,0],Y_test,Y_test_pred))
df_pred = pd.DataFrame(df_pred, columns=['Bundesland','Year','Y_test','Y_test_pred'])
# adjust data types
df_pred['Year'] = df_pred['Year'].astype(int)
df_pred['Y_test'] = df_pred['Y_test'].astype(float)
df_pred['Y_test_pred'] = df_pred['Y_test_pred'].astype(float)
# create a new dataframe for plotting the results
df_pred_plot = pd.DataFrame(df_pred, columns=['Bundesland','Year','Y_test_pred'])
# create a new variable "Data" to classify the type of data
df_pred_plot['Data'] = "$\hat{Y}_i$ Predicted Installed Capacity"
df_pred_plot = df_pred_plot.rename(columns = {'Y_test_pred':'Y'})
# merge two data frames with actual and predicted data together
df_plot = pd.merge(Y, df_pred_plot , how='outer', on=['Year','Bundesland','Y','Data'])
df_plot = df_plot.sort_values(by=['Year','Bundesland'])
# plot the results
ax = sns.lineplot(x="Year", y="Y", hue="Data", style="Data", markers=True, dashes=False,
data=df_plot)
ax.set(xlabel="Year",
ylabel="Installed Capacity per capita [W/cap]")
#title= "Annually Installed PV Capacity per capita in Germany [<30kW]")
A different approach using shifted feed-in data. The feed-in was shifted +6 years so that the feed-in peak coincides with the annually installed capacity peak (see graph below). This approach takes the delay between policy adaptation and commissioning of the PV modules on the roof-top.
# load the csv data from my online repository
data_shifted = pd.read_excel('https://raw.githubusercontent.com/eckmanna/big_data_policy_2020/master/course_project/data/df_pv_data_germany_fit-shifted.xlsx').iloc[:, :] #avoiding "unnamed"-row
df1_shifted = data_shifted[(data_shifted.Year>=1992) & (data_shifted.Year<2020)]
# X from datafram to array
X_full = df1_shifted[['Year','module_cost','feed-in','population','area','GDP_pc','solar_irradiance','sun_hours','share_green_party']].to_numpy()
# use both population and area, rather than population density only
print(X_full.shape)
# Y from datafram to array
Y_full = df1_shifted['Installed_Cap_below_30_pc'].to_numpy()
print(Y_full.shape)
# Bundeslaender from dataframe to array
Bundesland_full = df1_shifted['Bundesland'].to_numpy()
print(Bundesland_full.shape)
# X from array to dataframe
X_df = pd.DataFrame(X_full, columns=['Year','module_cost','feed-in','population','area','GDP_pc','solar_irradiance','sun_hours','share_green_party'])
#X_df = pd.DataFrame(X_full, columns=['Year','module_cost','feed-in','solar_irradiance','sun_hours','share_green_party'])
# Format year
X_df['Year'] = pd.to_datetime(X_df['Year'], format='%Y').dt.year
X_df.head()
p1 = sns.lineplot(x = "Year", y="Installed_Cap_below_30_pc", hue="Bundesland", data = data_shifted)
#plot module-costs
ax2 = p1.twinx()
sns.lineplot(x = "Year", y="module_cost", color="black", dashes=True, data = data_shifted, ax = ax2)
ax2.spines['right'].set_position(('axes', 1.0))
ax2.lines[0].set_linestyle("--")
#plot feed-in
ax3 = p1.twinx()
ax3.spines['right'].set_position(('axes', 1.1))
sns.lineplot(x = "Year", y="feed-in", color="blue", data = data_shifted, ax = ax3)
ax3.lines[0].set_linestyle("--")
p1.legend(loc='center right', bbox_to_anchor=(1.5, 0.5), ncol=1)
p1.set(xlabel="Year",
ylabel="Installed PV Capacity per capita [W/cap]")
#title= "Annually Installed PC Capacity per capita per Bundesland [<30kW]")
plt.show()
mask_year = np.logical_or.reduce((X_full[:,0]<1998, #drop data before 1998
X_full[:,0]>2019))
Y_full=Y_full[mask_year==False]
X_full=X_full[mask_year==False]
X_df=X_df[mask_year==False]
Bundesland_full=Bundesland_full[mask_year==False]
print(X_full.shape)
print(Y_full.shape)
print(Bundesland_full.shape)
Data Split:
data from years 1998 - 2019 (22 years)
train 45% or 10 years: from 1998 - 2007
test 55% or 12 years: from 2008 - 2019
# splitting the data in 2008
split = 2008
mask = X_full[:,0] >= split # TRUE for test data from split-year on
X_train = X_full[mask == False]
Y_train = Y_full[mask == False]
X_test = X_full[mask == True]
Y_test = Y_full[mask == True]
Bundesland_train = Bundesland_full[mask==False]
Bundesland_test = Bundesland_full[mask==True]
print("mask", mask.shape)
print("train data", X_train.shape, Y_train.shape)
print("test data", X_test.shape, Y_test.shape)
print("bundeslaender in train data", Bundesland_train.shape)
print("bundeslaender in test data", Bundesland_test.shape)
from sklearn.preprocessing import PolynomialFeatures
poly_features=PolynomialFeatures(degree=2)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.fit_transform(X_test)
lin_reg = LinearRegression()
lin_reg.fit(X_train_poly, Y_train)
Y_train_pred = lin_reg.predict(X_train_poly)
Y_test_pred = lin_reg.predict(X_test_poly)
train_mse = mean_squared_error(Y_train, Y_train_pred)
train_rmse = np.sqrt(train_mse)
test_mse = mean_squared_error(Y_test,Y_test_pred)
test_rmse = np.sqrt(test_mse)
print("train RMS: %s" % train_rmse)
print("test RMS: %s" % test_rmse)
print("train R2: %s" % round(r2_score(Y_train, Y_train_pred), 2))
print("test R2: %s" % round(r2_score(Y_test, Y_test_pred), 2))
r2 = r2=round(r2_score(Y_test, Y_test_pred), 2)
# Regplot
g=sns.regplot(x= Y_test_pred, y=Y_test, x_bins = 100)
g=g.set_title("Test sample")
plt.xlabel("Predicted Installed Capacity per capita: $\hat{Y}_i$")
plt.ylabel("Installed Capacity per capita: $Y_i$")
plt.annotate('R2={}'.format(r2),
xy=(1, 0), xycoords='axes fraction',
horizontalalignment='right',
verticalalignment='bottom')
plt.annotate('Notes: 100 binned',
xy=(0, 0), xycoords='figure fraction',
horizontalalignment='left',
verticalalignment='bottom')
plt.plot([0, 100], [0, 100], '--k')
plt.axis('tight')
plt.tight_layout()
plt.show(g)
plot_fit_quality(Y_test, Y_test_pred)
# extract the predicted data into a dataframe
df_pred = np.column_stack((Bundesland_test,X_test[:,0],Y_test,Y_test_pred))
df_pred = pd.DataFrame(df_pred, columns=['Bundesland','Year','Y_test','Y_test_pred'])
# adjust data types
df_pred['Year'] = df_pred['Year'].astype(int)
df_pred['Y_test'] = df_pred['Y_test'].astype(float)
df_pred['Y_test_pred'] = df_pred['Y_test_pred'].astype(float)
# create a new dataframe for plotting the results
df_pred_plot = pd.DataFrame(df_pred, columns=['Bundesland','Year','Y_test_pred'])
# create a new variable "Data" to classify the type of data
df_pred_plot['Data'] = "$\hat{Y}_i$ Predicted Installed Capacity"
df_pred_plot = df_pred_plot.rename(columns = {'Y_test_pred':'Y'})
# merge two data frames with actual and predicted data together
df_plot = pd.merge(Y, df_pred_plot , how='outer', on=['Year','Bundesland','Y','Data'])
df_plot = df_plot.sort_values(by=['Year','Bundesland'])
# plot the results
ax = sns.lineplot(x="Year", y="Y", hue="Data", style="Data", markers=True, dashes=False,
data=df_plot)
ax.set(xlabel="Year",
ylabel="Installed Capacity per capita [W/cap]")
#title= "Annually Installed PV Capacity per capita in Germany [<30kW]")
# apply different masks
mask_year = np.logical_or.reduce((X_full[:,0]<1998, #drop data before 1998
X_full[:,0]>2015))
Y_full=Y_full[mask_year==False]
X_full=X_full[mask_year==False]
X_df=X_df[mask_year==False]
Bundesland_full=Bundesland_full[mask_year==False]
print(X_full.shape)
print(Y_full.shape)
print(Bundesland_full.shape)
Data Split:
data from years 1998 - 2015 (18 years)
train 55% or 10 years: from 1998 - 2007
test 45% or 8 years: from 2008 - 2015
# splitting the data
split = 2008
mask = X_full[:,0] >= split # TRUE for test data from split-year on
X_train = X_full[mask == False]
Y_train = Y_full[mask == False]
X_test = X_full[mask == True]
Y_test = Y_full[mask == True]
Bundesland_train = Bundesland_full[mask==False]
Bundesland_test = Bundesland_full[mask==True]
print("mask", mask.shape)
print("train data", X_train.shape, Y_train.shape)
print("test data", X_test.shape, Y_test.shape)
print("bundeslaender in train data", Bundesland_train.shape)
print("bundeslaender in test data", Bundesland_test.shape)
from sklearn.preprocessing import PolynomialFeatures
poly_features=PolynomialFeatures(degree=2)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.fit_transform(X_test)
lin_reg = LinearRegression()
lin_reg.fit(X_train_poly, Y_train)
Y_train_pred = lin_reg.predict(X_train_poly)
Y_test_pred = lin_reg.predict(X_test_poly)
train_mse = mean_squared_error(Y_train, Y_train_pred)
train_rmse = np.sqrt(train_mse)
test_mse = mean_squared_error(Y_test,Y_test_pred)
test_rmse = np.sqrt(test_mse)
print("train RMS: %s" % train_rmse)
print("test RMS: %s" % test_rmse)
print("train R2: %s" % round(r2_score(Y_train, Y_train_pred), 2))
print("test R2: %s" % round(r2_score(Y_test, Y_test_pred), 2))